import pandas as pd
import numpy as np
import datetime as dt
import sys
import warnings
import IPython as ip
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from scipy.stats import t, shapiro
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.graphics.gofplots import qqplot
# ACP
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn import decomposition
from sklearn import preprocessing
from IPython.display import display
import missingno as msno
# Configuration pour travail avec fichier python "tools" de fonctions
%load_ext autoreload
%aimport tools
# Recharger les modules pour la conception des fichiers tools
%autoreload 1
# Set option
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings("ignore")
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
On surveille :
Problématique : Les données du jeu de données peuvent-elles répondre aux objectifs ?
# Import données
data = pd.read_csv('assets/datas/df_app_knnImputer.csv', sep='\t',parse_dates=[2,3], low_memory=False)
df = data.copy()
# Visualisation d'un échantillon de la population
df.sample(5)
| code | creator | created_datetime | last_modified_datetime | product_name | brands | categories_fr | countries_fr | additives_n | additives_fr | ingredients_from_palm_oil_n | nutrition_grade_fr | main_category_fr | energy_100g | fat_100g | saturated_fat_100g | carbohydrates_100g | sugars_100g | fiber_100g | proteins_100g | salt_100g | sodium_100g | nutrition_score_fr_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 22143 | 0030034086688 | usda-ndb-import | 2017-03-09 20:15:34 | 2017-03-09 20:15:34 | Butter Pecan | Giant Eagle, Giant Eagle Inc. | inconnu | États-Unis | 0.0 | 0.0 | d | inconnu | 1000.0 | 14.08 | 6.34 | 22.54 | 22.54 | 0.00 | 4.23 | 0.23368 | 0.092000 | 14.0 | |
| 143629 | 0856146004026 | usda-ndb-import | 2017-03-09 16:37:30 | 2017-03-09 16:37:30 | Marilyn's, Cheese Buttons | American Gra-Frutti Llc | inconnu | États-Unis | 2.0 | E300 - Acide ascorbique,E415 - Gomme xanthane | 0.0 | d | inconnu | 1644.0 | 25.00 | 16.07 | 28.57 | 0.00 | 0.00 | 14.29 | 0.81534 | 0.321000 | 17.0 |
| 24279 | 0032251149849 | usda-ndb-import | 2017-03-09 15:27:30 | 2017-03-09 15:27:30 | Assorted Fruitfuls Candy | Midwood Brands | inconnu | États-Unis | 4.0 | E330 - Acide citrique,E129 - Rouge allura AC,E... | 0.0 | d | inconnu | 1569.0 | 0.00 | 0.00 | 91.67 | 66.67 | 0.00 | 0.00 | 0.00000 | 0.000000 | 14.0 |
| 205686 | 4000856107133 | hangy | 2015-09-12 14:49:25 | 2017-03-07 17:53:50 | Radler Zitrone Alkoholfrei | Warsteiner | Boissons,Boissons alcoolisées,Bières,Boissons ... | Allemagne | 0.0 | 0.0 | e | Boissons | 148.0 | 0.00 | 0.00 | 7.70 | 7.70 | 0.48 | 0.30 | 0.01000 | 0.003937 | 11.0 | |
| 147008 | 0861258000101 | usda-ndb-import | 2017-03-09 15:59:13 | 2017-03-09 15:59:13 | Chedz, Premium Cheese Snacks, Spicy | Hall Brands | inconnu | États-Unis | 2.0 | E160b - Rocou,E375 - Acide nicotinique | 0.0 | e | inconnu | 2389.0 | 46.43 | 28.57 | 10.71 | 0.00 | 0.00 | 25.00 | 1.99644 | 0.786000 | 25.0 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 240304 entries, 0 to 240303 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code 240304 non-null object 1 creator 240304 non-null object 2 created_datetime 240304 non-null datetime64[ns] 3 last_modified_datetime 240304 non-null datetime64[ns] 4 product_name 240304 non-null object 5 brands 240304 non-null object 6 categories_fr 240304 non-null object 7 countries_fr 240304 non-null object 8 additives_n 240304 non-null float64 9 additives_fr 240304 non-null object 10 ingredients_from_palm_oil_n 240304 non-null float64 11 nutrition_grade_fr 240304 non-null object 12 main_category_fr 240304 non-null object 13 energy_100g 240304 non-null float64 14 fat_100g 240304 non-null float64 15 saturated_fat_100g 240304 non-null float64 16 carbohydrates_100g 240304 non-null float64 17 sugars_100g 240304 non-null float64 18 fiber_100g 240304 non-null float64 19 proteins_100g 240304 non-null float64 20 salt_100g 240304 non-null float64 21 sodium_100g 240304 non-null float64 22 nutrition_score_fr_100g 240304 non-null float64 dtypes: datetime64[ns](2), float64(12), object(9) memory usage: 42.2+ MB
tools.get_description_variables(df,type_var='categ')
| count | unique | top | freq | first | last | |
|---|---|---|---|---|---|---|
| code | 240304 | 240304 | 0000000004530 | 1 | NaT | NaT |
| creator | 240304 | 2477 | usda-ndb-import | 153975 | NaT | NaT |
| created_datetime | 240304 | 126364 | 2017-03-09 10:37:09 | 19 | 2012-01-31 14:43:58 | 2017-04-20 21:13:06 |
| last_modified_datetime | 240304 | 119721 | 2015-08-09 17:35:48 | 22 | 2012-04-08 08:12:35 | 2017-04-21 00:53:41 |
| product_name | 240304 | 186889 | Extra Virgin Olive Oil | 192 | NaT | NaT |
| brands | 240304 | 46265 | inconnue | 3145 | NaT | NaT |
| categories_fr | 240304 | 16277 | inconnu | 178919 | NaT | NaT |
| countries_fr | 240304 | 81 | États-Unis | 155671 | NaT | NaT |
| additives_fr | 240304 | 38480 | 102639 | NaT | NaT | |
| nutrition_grade_fr | 240304 | 6 | d | 57653 | NaT | NaT |
| main_category_fr | 240304 | 2322 | inconnu | 178919 | NaT | NaT |
tools.get_description_variables(df,type_var='num')
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| additives_n | 240304.0 | 1.787436 | 2.464454 | 0.0 | 0.00000 | 1.00000 | 3.00000 | 31.000000 |
| ingredients_from_palm_oil_n | 240304.0 | 0.017582 | 0.133063 | 0.0 | 0.00000 | 0.00000 | 0.00000 | 2.000000 |
| energy_100g | 240304.0 | 1123.056095 | 786.012530 | 0.0 | 389.00000 | 1100.00000 | 1674.00000 | 3776.000000 |
| fat_100g | 240304.0 | 12.054291 | 16.700297 | 0.0 | 0.00000 | 4.60000 | 19.00000 | 100.000000 |
| saturated_fat_100g | 240304.0 | 4.614094 | 7.549919 | 0.0 | 0.00000 | 1.25000 | 6.67000 | 100.000000 |
| carbohydrates_100g | 240304.0 | 31.203502 | 28.476440 | 0.0 | 6.45000 | 20.00000 | 56.76000 | 100.000000 |
| sugars_100g | 240304.0 | 15.283422 | 20.688020 | 0.0 | 1.01000 | 5.10000 | 22.58000 | 100.000000 |
| fiber_100g | 240304.0 | 2.435193 | 4.211174 | 0.0 | 0.00000 | 1.02000 | 3.30000 | 100.000000 |
| proteins_100g | 240304.0 | 7.113039 | 8.106444 | 0.0 | 0.71000 | 4.88000 | 10.00000 | 100.000000 |
| salt_100g | 240304.0 | 1.592437 | 6.193162 | 0.0 | 0.06858 | 0.59182 | 1.37414 | 100.000000 |
| sodium_100g | 240304.0 | 0.626730 | 2.436071 | 0.0 | 0.02700 | 0.23300 | 0.54300 | 39.370079 |
| nutrition_score_fr_100g | 240304.0 | 8.959197 | 8.787331 | -15.0 | 1.00000 | 9.00000 | 15.80000 | 40.000000 |
add_per_year = df['code'].groupby(by=df['created_datetime'].dt.year).nunique()
modified_per_year = df['code'].groupby(by=df['last_modified_datetime'].dt.year).nunique()
fig=plt.figure(figsize=(12,8))
font_title = {'family': 'serif',
'color': '#114b98',
'weight': 'bold',
'size': 18,
}
sns.set_style("whitegrid")
plt.plot(add_per_year,
color="#114b98",
label="Ajouts")
plt.plot(modified_per_year,
color="#00afe6",
label="Modifications")
plt.title("Evolution des créations et modifications de produits par année",
fontdict=font_title)
plt.xlabel("Année")
plt.ylabel("Nombre de produits")
plt.legend()
plt.savefig("assets/graphiques/Evolutions_dates.jpg")
plt.show()
float_columns = df.select_dtypes(include=['float64']).columns.to_list()
object_columns = df.select_dtypes(include=['object']).columns.to_list()
datetime_columns = df.select_dtypes(include=['datetime64[ns]']).columns.to_list()
# On écarte les dates
# Variables numériques
cols_num = df.select_dtypes(include=[np.number]).columns.to_list()
# Variables quantitatives discrètes
cols_quant_discr = ['additives_n','ingredients_from_palm_oil_n','nutriscore_score_fr']
# Variables quantitatives continue
cols_quant_cont = ['energy_100g', 'fat_100g','saturated_fat_100g',
'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
'proteins_100g', 'salt_100g', 'sodium_100g']
# Mesures de tendances centrales des colonnes quantitatives continues
tools.stat_descriptives(df,cols_quant_cont)
| Desc | energy_100g | fat_100g | saturated_fat_100g | carbohydrates_100g | sugars_100g | fiber_100g | proteins_100g | salt_100g | sodium_100g |
|---|---|---|---|---|---|---|---|---|---|
| mean | 1123.056095 | 12.054291 | 4.614094 | 31.203502 | 15.283422 | 2.435193 | 7.113039 | 1.592437 | 0.626730 |
| median | 1100.000000 | 4.600000 | 1.250000 | 20.000000 | 5.100000 | 1.020000 | 4.880000 | 0.591820 | 0.233000 |
| var | 617813.126622 | 278.898761 | 57.001032 | 810.904287 | 427.992372 | 17.733913 | 65.714153 | 38.355100 | 5.934415 |
| std | 786.010895 | 16.700262 | 7.549903 | 28.476381 | 20.687977 | 4.211165 | 8.106427 | 6.193149 | 2.436065 |
| skew | 0.425871 | 2.231646 | 3.502017 | 0.623817 | 1.725949 | 5.409043 | 2.126997 | 11.073758 | 11.093051 |
| kurtosis | -0.446573 | 6.514148 | 22.325536 | -0.954933 | 2.479089 | 58.608099 | 8.571377 | 142.319727 | 142.787027 |
| mode | 0 0.0 | 0 0.0 | 0 0.0 | 0 0.0 | 0 0.0 | 0 0.0 | 0 0.0 | 0 0.0 | 0 0.0 |
| Min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| Max | 3776.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 39.370079 |
Si γ1=0 alors la distribution est symétrique.
Si γ1>0 alors la distribution est étalée à droite.
Si γ1<0 alors la distribution est étalée à gauche.
def var_hist(var, i):
subset = df[var]
n_df_valide = len(df)
xbar = np.mean(df[var]) # Moyenne
sprime = np.std(df[var], ddof=1) # Ecart-type
sprime2 = np.var(df[var], ddof=1) #Variance non biaisée
ax = fig.add_subplot(i)
ax.hist(subset, density=True)
ax.axvline(xbar, color='r', linewidth=2, label="Moyenne empirique")
bins = np.arange(df[var].min(),df[var].max(),0.05)
y = st.norm.pdf(bins, xbar, sprime)
ax.plot(bins, y, '--', label="Densité normale")
ax.legend()
ax.set_xlabel(var, fontsize=12)
ax.set_ylabel('Densité', fontsize=12)
ax.set_title('Distribution de '+str(var), fontsize=18)
liste_var = cols_quant_cont
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,30),constrained_layout=False)
i = 331
for var in liste_var :
var_hist(var, i)
i+=1
plt.savefig("assets/graphiques/analyse univariee histo_dfComplet.jpg")
# Representation graphique des outliers:
a = 3 # nombre de lignes
b = 3 # nombre de colonnes
c = 1 # initialisation
fig = plt.figure(figsize=(20,8))
for i in df.loc[:, cols_quant_cont]: # pour toute les colonnnes quantatives
plt.subplot(a, b, c) # maillage des subplot
plt.title('{} (boxplot)'.format(i, a, b, c))# titres des box plot
plt.xlabel(i) # xlabel = nom de la colonne
sns.boxplot(x = df[i]) # faire un boxplot sns
c = c + 1 # incrementation ==> création d'un nouveau box plot
plt.subplots_adjust(left=0.125, # gerer les espacements
bottom=0.1,
right=0.9,
top=0.9,
wspace=0.2,
hspace=0.35)
ANALYSE :
# On s'occupe ici uniquement des nutrigrades complétés
df_nutri = df[~(df['nutrition_grade_fr']=='0')]
# On s'occupe ici uniquement des nutrigrades complétés
df_nutriscore = df[~(df['nutrition_score_fr_100g']=='0')]
df_nutriscore = df_nutriscore[~(df_nutriscore['nutrition_grade_fr']=='0')]
# Courbe de distribution du nutriscore
plt.figure(figsize=(12, 8))
sns.histplot(df_nutriscore['nutrition_score_fr_100g'], kde=True,
color='SteelBlue', label='Nutri_score pour 100g de produit')
plt.title("Distribution du nutri-score", fontsize=14)
plt.xlim(-15, 40)
plt.xlabel('Score', fontsize=12)
plt.ylabel('Nombre de produits par score', fontsize=12)
plt.legend()
plt.show()
fig = plt.figure(figsize=(15, 6))
ax1 = fig.add_subplot(1, 2, 1)
box = sns.boxplot(data=df_nutri['nutrition_score_fr_100g'], color='SteelBlue', ax=ax1)
# box.set(ylabel=unite)
plt.grid(False)
ax2 = fig.add_subplot(1, 2, 2)
ax2 = sm.qqplot(df_nutri['nutrition_score_fr_100g'],
line='r', ax=ax2)
plt.grid(False)
fig.suptitle('Dispersion des nutrition-score-fr_100g', fontweight='bold', size=14)
plt.show()
col = ['nutrition_score_fr_100g']
tools.stat_descriptives(df_nutri,col)
| Desc | nutrition_score_fr_100g |
|---|---|
| mean | 9.131191 |
| median | 10.000000 |
| var | 81.995724 |
| std | 9.055149 |
| skew | 0.116733 |
| kurtosis | -1.017174 |
| mode | 0 0.0 |
| Min | -15.000000 |
| Max | 40.000000 |
# définition des bacs
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.IntervalIndex.from_tuples.html
liste_bins = pd.IntervalIndex.from_tuples(
[(-15, -1), (0, 2), (3, 10), (10, 18), (19, 40)])
tools.distribution_variables_plages_perc_donnees(df_nutri,'nutrition_score_fr_100g',liste_bins)
| Plage | nb_données | %_données |
|---|---|---|
| (-15, -1] | 33037 | 16.241980 |
| (0, 2] | 20121 | 9.892087 |
| (3, 10] | 35405 | 17.406160 |
| (10, 18] | 58204 | 28.614832 |
| (19, 40] | 32121 | 15.791647 |
Bilan A COMPLETER
# On visualise le nombre de valeurs uniques contenu dans les colonnes de type object
for col in df.select_dtypes('object'):
print(f'{col:-<50} {df[col].nunique()}')
code---------------------------------------------- 240304 creator------------------------------------------- 2477 product_name-------------------------------------- 186889 brands-------------------------------------------- 46265 categories_fr------------------------------------- 16277 countries_fr-------------------------------------- 81 additives_fr-------------------------------------- 38480 nutrition_grade_fr-------------------------------- 6 main_category_fr---------------------------------- 2322
# Variables qualitatives ou modalités
# Variables qualitatives nominales
cols_qual_nom = ['code','creator','product_name','brands',
'categories_fr','main_category_fr', 'countries_fr','additives_fr']
def top_N_pie (df,var,name,n,taille,perc) :
'''
Fonction qui visualise les n plus grand d'une colonne avec ou sans pourcentage
parametres :
df
var : colonne ciblée
name : 'nom de la colonne '
n : nombre de top voulu
taille : taille du pieplot
per : si True : affiche les pourcentages
'''
target = df.groupby(by=var)['code'].nunique().sort_values(ascending=False)
# Graphiques top N
fig, ax = plt.subplots(figsize=(taille, taille), subplot_kw=dict(aspect="equal"))
explodes = np.zeros(n)
explodes[0] = .1
# calcul des pourcentages
if perc:
def pct_tot(pct):
tot = round(pct*target[:n].sum(),0)
tot_pct = tot/target.sum()
return "{:.1f}%\n({:.0f})".format(tot_pct,(tot/100))
plt.pie(target[:n], labels=target[:n].index,
startangle=45,
shadow=True,
autopct=lambda pct: pct_tot(pct),
explode=explodes,
textprops=dict(color="black",size=12, weight="bold"))
else :
plt.pie(target[:n], labels=target[:n].index,
startangle=45,
shadow=True,
explode=explodes,
textprops=dict(color="black",size=10, weight="bold"))
plt.title(f"TOP {n} : {name}",fontweight='bold',fontsize=24)
plt.show()
# Nombre de créateurs, sources des données
print(f"Nombre de sources unique : {df['creator'].nunique()}")
Nombre de sources unique : 2477
top_N_pie(df,'creator','Contributeurs',5,12,True)
plt.savefig("assets/graphiques/Top_Contributeurs.jpg")
<Figure size 640x480 with 0 Axes>
df['brands'].nunique()
46265
# On s'occupe ici uniquement des catégories renseignées
df_brands = df[~(df['brands']=='inconnue')]
df_brands.shape
(237159, 23)
top_N_pie(df_brands,'brands','Marques',10,8,True)
plt.savefig("assets/graphiques/Top_Marques.jpg")
<Figure size 640x480 with 0 Axes>
# Tableau fréquences
dico = df_brands.groupby('brands')['brands'].count().sort_values(ascending=False).to_dict()
nom = 'brands'
col1 = 'Nom_' + nom
col2 = 'Nbr_' + nom
col3 = 'Fréquence (%)'
df_gpe = pd.DataFrame(dico.items(), columns=[col1, col2])
df_gpe[col3] = (df_gpe[col2] * 100) / len(df_brands)
df_gpe.head(10)
| Nom_brands | Nbr_brands | Fréquence (%) | |
|---|---|---|---|
| 0 | Carrefour | 2362 | 0.995956 |
| 1 | Auchan | 1768 | 0.745491 |
| 2 | Meijer | 1702 | 0.717662 |
| 3 | U | 1681 | 0.708807 |
| 4 | Kroger | 1454 | 0.613091 |
| 5 | Leader Price | 1372 | 0.578515 |
| 6 | Casino | 1233 | 0.519904 |
| 7 | Ahold | 1181 | 0.497978 |
| 8 | Roundy's | 1112 | 0.468884 |
| 9 | Spartan | 1063 | 0.448223 |
# Wordecloud
from wordcloud import WordCloud
wordcloud = WordCloud(width=800,height=400, background_color="white",max_words=100).generate_from_frequencies(dico)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
df_gp_red = df_gpe.head(10)
sns.set_style("whitegrid")
plt.figure(figsize=(8, 4))
sns.barplot(
y=df_gp_red[col1],
x=df_gp_red[col3],
data=df_gp_red,
color='SteelBlue')
plt.title('Répartition de la présence des marques dans le jeu de données')
plt.grid(False)
plt.tight_layout()
plt.show()
df['categories_fr'].nunique()
16277
# On s'occupe ici uniquement des catégories renseignées
df_categ = df[~(df['categories_fr']=='inconnu')]
tools.affiche_wordcloud_tabfreq(df_categ,'categories_fr','categories',)
| Nom_categories | Nbr_categories | Fréquence (%) |
|---|---|---|
| Snacks sucrés,Biscuits et gâteaux,Biscuits | 691 | 1.125682 |
| Snacks sucrés,Chocolats,Chocolats noirs | 529 | 0.861774 |
| Aliments et boissons à base de végétaux,Aliments d'origine végétale,Petit-déjeuners,Céréales et pommes de terre,Céréales et dérivés,Céréales pour petit-déjeuner | 450 | 0.733078 |
| Snacks sucrés,Biscuits et gâteaux,Biscuits,Biscuits au chocolat | 409 | 0.666287 |
| Snacks salés,Apéritif,Biscuits apéritifs | 404 | 0.658141 |
| Snacks sucrés,Confiseries,Bonbons | 374 | 0.609269 |
| Snacks sucrés,Chocolats,Chocolats au lait | 363 | 0.591350 |
| Produits laitiers,Yaourts | 349 | 0.568543 |
| Snacks sucrés,Chocolats | 301 | 0.490348 |
| Epicerie,Sauces | 299 | 0.487090 |
tools.affiche_wordcloud_tabfreq(df_categ,'main_category_fr','Main categories',affword=False)
| Nom_Main categories | Nbr_Main categories | Fréquence (%) |
|---|---|---|
| Boissons | 2290 | 3.730553 |
| Epicerie | 2278 | 3.711004 |
| Aliments et boissons à base de végétaux | 2274 | 3.704488 |
| Chocolats | 2252 | 3.668649 |
| Conserves | 2014 | 3.280932 |
| Biscuits | 1853 | 3.018653 |
| Plats préparés | 1813 | 2.953490 |
| Surgelés | 1713 | 2.790584 |
| Petit-déjeuners | 1566 | 2.551112 |
| Snacks sucrés | 1515 | 2.468030 |
Parmi les catégories les plus représentés ont retrouve beaucoup de produits considérés comme a surveiller dans une alimentation saine.
A défaut d'une sur représentation des produits sain on peut exploiter ces informations nombreuses pour informer le consommateur sur les produits à surveiller. Une bonne alimentation passe aussi par le plaisir et ne doit pas être stigmatisé sans avis médical personnalisé contraire. Pour notre appli cette source de données est importantes et l'application devra signaler les choses de manière pédagogique
# On s'occupe ici uniquement des catégories renseignées
df_additives = df[~(df['additives_fr']=='-1')]
top_N_pie(df_additives,'additives_fr','Principaux additifs présents',5,8,True)
plt.savefig("assets/graphiques/Top_additives.jpg")
<Figure size 640x480 with 0 Axes>
# On s'occupe ici uniquement des additifs ciblés
df_additives_target = df_additives.copy()
df_additives_target = df_additives_target[df_additives_target['additives_fr'].str.contains("338|339|340|341|343|450|451|452")]
tools.affiche_wordcloud_tabfreq(df_additives_target,'additives_fr','Additives',affword=False)
| Nom_Additives | Nbr_Additives | Fréquence (%) |
|---|---|---|
| E452vi - Tripolyphosphate de sodium et de potassium | 345 | 1.564271 |
| E339iii - Phosphate de sodium tribasique,E316 - Erythorbate de sodium,E250 - Nitrite de sodium | 290 | 1.314895 |
| E339iii - Phosphate de sodium tribasique | 286 | 1.296758 |
| E339 - Orthophosphates de sodium | 211 | 0.956699 |
| E339 - Orthophosphates de sodium,E316 - Erythorbate de sodium,E250 - Nitrite de sodium | 194 | 0.879619 |
| E450 - Sels métalliques de diphosphates | 178 | 0.807073 |
| E325 - Lactate de sodium,E339 - Orthophosphates de sodium,E262ii,E316 - Erythorbate de sodium,E250 - Nitrite de sodium | 177 | 0.802539 |
| E325 - Lactate de sodium,E339iii - Phosphate de sodium tribasique,E262ii,E316 - Erythorbate de sodium,E250 - Nitrite de sodium | 175 | 0.793471 |
| E375 - Acide nicotinique,E101 - Riboflavine,E450 - Sels métalliques de diphosphates | 159 | 0.720925 |
| E341iii - Phosphate de tricalcium | 132 | 0.598504 |
# Variables qualitatives ordinales
cols_qual_ord = ['nutriscore_grade_fr']
# On s'occupe ici uniquement des nutrigrades complétés
df_nutri = df[~(df['nutrition_grade_fr']=='0')]
nutrition_grade = df_nutri.groupby(by='nutrition_grade_fr')['code'].nunique().sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(aspect="equal"))
explodes = np.zeros(5)
explodes[0] = .1
plt.pie(nutrition_grade, labels=nutrition_grade.index,
startangle=0,
colors=['#ee8100','#fecb02','#e63e11','#038141','#85bb2f'],
shadow=True,
explode=explodes,
autopct='%1.1f%%',
textprops=dict(color="black",size=12, weight="bold"))
plt.title("Répartition des Nutrition_grade", fontdict=font_title)
plt.savefig("assets/graphiques/Répartion_nutrigrdes.jpg")
plt.show()
# Répartition des variables quantitative en fonction du nutrigrade
colors_nutri = ['#038141','#85bb2f','#fecb02','#ee8100','#e63e11']
fig = plt.figure(figsize=(20, 35))
for i, c in enumerate(df.select_dtypes('float'), 1):
ax = fig.add_subplot(6, 2, i)
sns.boxplot(data=df_nutri, x='nutrition_grade_fr', y=c,order='abcde', ax=ax,palette=colors_nutri)
plt.grid(False)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.suptitle('Répartition des variables quantitatives en fonction du nutrigrade', fontsize=30)
plt.savefig("assets/graphiques/Répartition des variables quantitatives en fonction du nutrigrade.jpg")
plt.show()
sns.pairplot(df_nutri.sample(frac=0.05), hue="nutrition_grade_fr")
plt.savefig("assets/graphiques/Pairplot_Nutrition grade.jpg")
sns.clustermap(df_nutri.corr(),annot=True)
<seaborn.matrix.ClusterGrid at 0x2274b6f9f30>
Bilan
# On s'occupe ici uniquement des nutrigrades complétés
df_nutriscore = df[~(df['nutrition_score_fr_100g']=='0')]
df_nutriscore = df_nutriscore[~(df_nutriscore['nutrition_grade_fr']=='0')]
# graph
sns.histplot(data=df_nutriscore.sort_values("nutrition_grade_fr"), x="nutrition_score_fr_100g", hue="nutrition_grade_fr")
plt.show()
fig, axes = plt.subplots(1, 2, sharex=False, sharey=False, figsize=(21,8))
fig.suptitle(r"Répartition des scores Nutriscore et de leurs grades" "\n", fontsize=22)
sns.histplot(data=df_nutriscore.sort_values("nutrition_grade_fr"), x="nutrition_grade_fr", hue="nutrition_grade_fr", ax=axes[0])
axes[0].set_title('Grades de Nutriscores')
axes[0].set_xlabel("nutrition_grade_fr")
axes[0].set_ylabel("Nombre de produits")
sns.histplot(data=df_nutriscore.sort_values("nutrition_grade_fr"), x="nutrition_score_fr_100g", hue="nutrition_grade_fr", ax=axes[1])
axes[1].set_title('Scores de Nutriscores')
axes[1].set_xlabel("Score Nutriscore")
axes[1].set_ylabel("Nombre de produits")
plt.show()
df_nutri.columns
Index(['code', 'creator', 'created_datetime', 'last_modified_datetime',
'product_name', 'brands', 'categories_fr', 'countries_fr',
'additives_n', 'additives_fr', 'ingredients_from_palm_oil_n',
'nutrition_grade_fr', 'main_category_fr', 'energy_100g', 'fat_100g',
'saturated_fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
'proteins_100g', 'salt_100g', 'sodium_100g', 'nutrition_score_fr_100g'],
dtype='object')
df_nutri.groupby(['nutrition_grade_fr']).size().plot(kind='bar',)
<AxesSubplot: xlabel='nutrition_grade_fr'>
df_nutri.groupby(['nutrition_grade_fr']).size()
nutrition_grade_fr a 32971 b 31373 c 42096 d 57653 e 39312 dtype: int64
import statsmodels.formula.api as smf
import statsmodels.api as sm
anova_nutrigrade = smf.ols('salt_100g~nutrition_grade_fr', data=df_nutri).fit()
print(anova_nutrigrade.summary())
OLS Regression Results
==============================================================================
Dep. Variable: salt_100g R-squared: 0.020
Model: OLS Adj. R-squared: 0.020
Method: Least Squares F-statistic: 1053.
Date: Sat, 07 Jan 2023 Prob (F-statistic): 0.00
Time: 10:37:11 Log-Likelihood: -5.6608e+05
No. Observations: 203405 AIC: 1.132e+06
Df Residuals: 203400 BIC: 1.132e+06
Df Model: 4
Covariance Type: nonrobust
===========================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------
Intercept 0.3317 0.022 15.395 0.000 0.289 0.374
nutrition_grade_fr[T.b] 0.1973 0.031 6.394 0.000 0.137 0.258
nutrition_grade_fr[T.c] 1.3756 0.029 47.811 0.000 1.319 1.432
nutrition_grade_fr[T.d] 1.3139 0.027 48.641 0.000 1.261 1.367
nutrition_grade_fr[T.e] 1.1733 0.029 40.160 0.000 1.116 1.231
==============================================================================
Omnibus: 390897.003 Durbin-Watson: 1.250
Prob(Omnibus): 0.000 Jarque-Bera (JB): 753180834.671
Skew: 15.237 Prob(JB): 0.00
Kurtosis: 299.547 Cond. No. 6.46
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import statsmodels.formula.api as smf
import statsmodels.api as sm
anova_nutrigrade = smf.ols('proteins_100g~nutrition_grade_fr', data=df_nutri).fit()
print(anova_nutrigrade.summary())
OLS Regression Results
==============================================================================
Dep. Variable: proteins_100g R-squared: 0.028
Model: OLS Adj. R-squared: 0.028
Method: Least Squares F-statistic: 1491.
Date: Sat, 07 Jan 2023 Prob (F-statistic): 0.00
Time: 10:37:14 Log-Likelihood: -7.0958e+05
No. Observations: 203405 AIC: 1.419e+06
Df Residuals: 203400 BIC: 1.419e+06
Df Model: 4
Covariance Type: nonrobust
===========================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------
Intercept 8.3360 0.044 191.083 0.000 8.250 8.421
nutrition_grade_fr[T.b] -3.2269 0.062 -51.651 0.000 -3.349 -3.104
nutrition_grade_fr[T.c] -1.3744 0.058 -23.593 0.000 -1.489 -1.260
nutrition_grade_fr[T.d] 0.2438 0.055 4.457 0.000 0.137 0.351
nutrition_grade_fr[T.e] 0.8275 0.059 13.988 0.000 0.712 0.943
==============================================================================
Omnibus: 91895.107 Durbin-Watson: 0.937
Prob(Omnibus): 0.000 Jarque-Bera (JB): 713257.449
Skew: 2.014 Prob(JB): 0.00
Kurtosis: 11.242 Cond. No. 6.46
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
import statsmodels.formula.api as smf
import statsmodels.api as sm
anova_nutrigrade = smf.ols('energy_100g~nutrition_grade_fr', data=df_nutri).fit()
print(anova_nutrigrade.summary())
OLS Regression Results
==============================================================================
Dep. Variable: energy_100g R-squared: 0.376
Model: OLS Adj. R-squared: 0.376
Method: Least Squares F-statistic: 3.070e+04
Date: Sat, 07 Jan 2023 Prob (F-statistic): 0.00
Time: 10:37:16 Log-Likelihood: -1.5885e+06
No. Observations: 203405 AIC: 3.177e+06
Df Residuals: 203400 BIC: 3.177e+06
Df Model: 4
Covariance Type: nonrobust
===========================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------------
Intercept 725.1980 3.284 220.833 0.000 718.762 731.634
nutrition_grade_fr[T.b] -204.7798 4.703 -43.543 0.000 -213.997 -195.562
nutrition_grade_fr[T.c] 272.1357 4.385 62.057 0.000 263.541 280.731
nutrition_grade_fr[T.d] 746.4478 4.117 181.299 0.000 738.378 754.517
nutrition_grade_fr[T.e] 1110.2484 4.453 249.329 0.000 1101.521 1118.976
==============================================================================
Omnibus: 8953.183 Durbin-Watson: 0.939
Prob(Omnibus): 0.000 Jarque-Bera (JB): 10833.178
Skew: 0.487 Prob(JB): 0.00
Kurtosis: 3.574 Cond. No. 6.46
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
sm.stats.anova_lm(anova_nutrigrade, typ=2)
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| nutrition_grade_fr | 4.366278e+10 | 4.0 | 30699.743948 | 0.0 |
| Residual | 7.232152e+10 | 203400.0 | NaN | NaN |
df_acp = df_nutri.copy()
df_acp = df_acp.drop(columns=['additives_n','ingredients_from_palm_oil_n'])
# df_nutri=df_acp.copy()
# Données expression
X = df_acp .select_dtypes('number')
print('X', X.shape)
X (203405, 10)
# Etiquettes correspondantes (sous-types moléculaires)
y = df_acp ['nutrition_grade_fr']
print('y', y.shape)
y (203405,)
sort_by_mean = X.mean().sort_values(ascending=True)
X[sort_by_mean.index].plot(kind='box', figsize=(15, 5), rot=90, ylabel='Expression')
<AxesSubplot: ylabel='Expression'>
scaler = StandardScaler() # instanciation de l'objet scaler
X_scaled = scaler.fit_transform(X) # normalisation centrée-réduite
X_scaled = pd.DataFrame(X_scaled, index=X.index, columns=X.columns) # conversion du résultat en objet dataframe de pandas
X_scaled.plot(kind='box', figsize=(15, 4), rot=90, ylabel='Expression')
<AxesSubplot: ylabel='Expression'>
# Calcul de l'ACP
pca = PCA() # instanciation de l'objet pca
X_pca = pca.fit_transform(X_scaled) # réalisation de l'ACP sur les données X_scaled
# Conversion en dataframe pandas
pca_columns = ['PC' + str(c) for c in range(1, X_pca.shape[1]+1, 1)] # création d'une liste avec les noms de colonnes de PC1 à PC50
X_pca = pd.DataFrame(X_pca, index=X.index, columns=pca_columns) # création du dataframe
X_pca.head()
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.976183 | -0.443672 | -0.429658 | -0.430514 | 1.164263 | 0.067646 | 2.041363 | 0.465917 | 0.111609 | -0.000113 |
| 1 | 0.482444 | -0.433707 | -0.053045 | 2.170992 | -0.550786 | -0.658820 | -0.160754 | 0.701215 | -0.032162 | -0.000161 |
| 2 | 2.380716 | 0.684614 | -1.944545 | 1.243508 | 0.489103 | -1.004062 | -1.416242 | 0.309401 | 0.060203 | -0.000051 |
| 6 | 0.994814 | -0.531088 | -0.153626 | 1.946883 | 0.066815 | -0.053333 | 0.043166 | 0.030669 | 0.062318 | -0.000033 |
| 10 | 1.697466 | 0.513297 | -1.071891 | 1.337685 | 0.060372 | -0.741211 | -0.559919 | -0.079997 | -0.029690 | -0.000033 |
pca.explained_variance_ratio_
array([3.16486435e-01, 2.08782529e-01, 1.73469763e-01, 1.32790701e-01,
6.75125315e-02, 4.39407177e-02, 3.03336992e-02, 1.89560606e-02,
7.72681459e-03, 7.48314104e-07])
# On peut les convertir en objet Series de pandas et présenter les valeurs en pourcentage.
explained_variance = pd.Series(dict(zip(X_pca.columns, 100.0*pca.explained_variance_ratio_)))
print(explained_variance.head())
PC1 31.648644 PC2 20.878253 PC3 17.346976 PC4 13.279070 PC5 6.751253 dtype: float64
explained_variance.plot(kind='bar', figsize=(15, 4), rot=90, ylabel='Explained variance')
<AxesSubplot: ylabel='Explained variance'>
explained_variance['PC1'] + explained_variance['PC2']
52.52689639260595
explained_variance['PC1'] + explained_variance['PC2'] + explained_variance['PC3']
69.8738727385225
# Eboulis des valeurs propres
fig = plt.figure(figsize=(8,6))
plt.bar(np.arange(len(explained_variance))+1, explained_variance)
plt.plot(np.arange(len(explained_variance))+1, explained_variance.cumsum(),c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres")
plt.show(block=False)
#fig.savefig('img/ACP_eboulis_valeurs-propres.png')
X_pca.plot(x='PC1', y='PC2', kind='scatter', figsize=(5, 5), color='gray')
<AxesSubplot: xlabel='PC1', ylabel='PC2'>
n_comp = len(df_acp.columns)
n = df_acp.shape[0]
p = df_acp.shape[1]
features =cols_acp
eigval= (n-1)/n*pca.explained_variance_
eigval
array([3.16486674e+00, 2.08782686e+00, 1.73469894e+00, 1.32790801e+00,
6.75125825e-01, 4.39407508e-01, 3.03337221e-01, 1.89560750e-01,
7.72682043e-02, 7.48314669e-06])
#Racine carrée des valeurs propres
sqrt_eigval = np.sqrt(eigval)
#Corrélation des variables avec les axes
covar = np.zeros((p,p))
for k in range(p):
covar[:,k] = pca.components_[k,:] * sqrt_eigval[k]
mat_cor = pd.DataFrame(np.around(covar, 2),
index=features,
columns=['COR_'+str(i + 1) for i in range(p)])
mat_cor
| COR_1 | COR_2 | COR_3 | COR_4 | COR_5 | COR_6 | COR_7 | COR_8 | COR_9 | COR_10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| energy_100g | 0.92 | -0.06 | 0.00 | 0.21 | 0.00 | -0.25 | -0.00 | 0.05 | -0.21 | -0.0 |
| fat_100g | 0.78 | 0.14 | -0.41 | -0.06 | 0.26 | -0.23 | -0.24 | 0.09 | 0.14 | 0.0 |
| saturated_fat_100g | 0.75 | 0.11 | -0.32 | -0.29 | 0.16 | 0.26 | 0.36 | 0.11 | 0.02 | -0.0 |
| carbohydrates_100g | 0.41 | -0.33 | 0.67 | 0.35 | -0.11 | -0.25 | 0.24 | -0.04 | 0.12 | 0.0 |
| sugars_100g | 0.44 | -0.37 | 0.68 | -0.14 | -0.13 | 0.29 | -0.21 | 0.21 | 0.00 | -0.0 |
| fiber_100g | 0.18 | -0.07 | -0.04 | 0.86 | 0.35 | 0.29 | -0.06 | -0.09 | 0.00 | 0.0 |
| proteins_100g | 0.25 | 0.26 | -0.51 | 0.43 | -0.64 | 0.09 | -0.01 | 0.06 | 0.04 | 0.0 |
| salt_100g | 0.00 | 0.93 | 0.37 | 0.06 | 0.04 | -0.00 | 0.01 | 0.03 | -0.00 | 0.0 |
| sodium_100g | 0.00 | 0.93 | 0.37 | 0.06 | 0.04 | -0.00 | 0.01 | 0.03 | -0.00 | -0.0 |
| nutrition_score_fr_100g | 0.83 | 0.12 | 0.14 | -0.35 | -0.14 | 0.13 | -0.10 | -0.33 | 0.01 | 0.0 |
#Cercle des corrélations
tools.display_circles(covar.T, n_comp, pca,
[(0,1), (2,3), (4,5)],
labels = features)
plt.show()
#Variable Illustrative
ivNutrigrade = df_nutri['nutrition_grade_fr'].values
#Encodage des nutrition_grades
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
ivNutrigrade = encoder.fit_transform(ivNutrigrade)
ivNutrigrade = ivNutrigrade.reshape((ivNutrigrade.shape[0],1))
#Corrélation de la variable illustrative avec les axes factoriels
corrIv = np.zeros((ivNutrigrade.shape[1],p))
for j in range(p):
for k in range(ivNutrigrade.shape[1]):
corrIv[k,j] = np.corrcoef(ivNutrigrade[:,k],X_projected[:,j])[0,1]
# Representation du nuage des individus sur le premier plan factoriel
X_projected = pca.transform(X_scaled)
tools.display_factorial_planes(X_projected,
n_comp,
pca,
[(0,1), (2,3), (4,5)],
labels=None,
alpha=0.15,
illustrative_var=y)
plt.show()
df_nutri.columns
Index(['code', 'creator', 'created_datetime', 'last_modified_datetime',
'product_name', 'brands', 'categories_fr', 'countries_fr',
'additives_n', 'additives_fr', 'ingredients_from_palm_oil_n',
'nutrition_grade_fr', 'main_category_fr', 'energy_100g', 'fat_100g',
'saturated_fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
'proteins_100g', 'salt_100g', 'sodium_100g', 'nutrition_score_fr_100g'],
dtype='object')
df_acp = df_nutri.copy()
df_acp = df_acp.drop(columns=['additives_n','ingredients_from_palm_oil_n'])
# df_nutri=df_acp.copy()
df_acp.head()
| code | creator | created_datetime | last_modified_datetime | product_name | brands | categories_fr | countries_fr | additives_fr | nutrition_grade_fr | main_category_fr | energy_100g | fat_100g | saturated_fat_100g | carbohydrates_100g | sugars_100g | fiber_100g | proteins_100g | salt_100g | sodium_100g | nutrition_score_fr_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000000004530 | usda-ndb-import | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 | Banana Chips Sweetened (Whole) | inconnue | inconnu | États-Unis | d | inconnu | 2243.0 | 28.57 | 28.57 | 64.29 | 14.29 | 3.6 | 3.57 | 0.00000 | 0.000 | 14.0 | |
| 1 | 0000000004559 | usda-ndb-import | 2017-03-09 14:32:37 | 2017-03-09 14:32:37 | Peanuts | Torn & Glasser | inconnu | États-Unis | b | inconnu | 1941.0 | 17.86 | 0.00 | 60.71 | 17.86 | 7.1 | 17.86 | 0.63500 | 0.250 | 0.0 | |
| 2 | 0000000016087 | usda-ndb-import | 2017-03-09 10:35:31 | 2017-03-09 10:35:31 | Organic Salted Nut Mix | Grizzlies | inconnu | États-Unis | d | inconnu | 2540.0 | 57.14 | 5.36 | 17.86 | 3.57 | 7.1 | 17.86 | 1.22428 | 0.482 | 12.0 | |
| 6 | 0000000016124 | usda-ndb-import | 2017-03-09 10:35:11 | 2017-03-09 10:35:12 | Organic Muesli | Daddy's Muesli | inconnu | États-Unis | E123 - Amarante,E307a - Tocophérol | c | inconnu | 1833.0 | 18.75 | 4.69 | 57.81 | 15.62 | 9.4 | 14.06 | 0.13970 | 0.055 | 7.0 |
| 10 | 0000000016872 | usda-ndb-import | 2017-03-09 10:34:10 | 2017-03-09 10:34:11 | Zen Party Mix | Sunridge | inconnu | États-Unis | E100 - Curcumine | d | inconnu | 2230.0 | 36.67 | 5.00 | 36.67 | 3.33 | 6.7 | 16.67 | 1.60782 | 0.633 | 12.0 |
# Sélection des colonnes pour l'ACP
cols_acp = df_acp.select_dtypes(include=[np.number]).columns.to_list()
# Nombre de composantes
n_comp = len(cols_acp)
# Données pour l'ACP
df_acp = df[cols_acp]
# Noms affichés
names = df_nutri['product_name']
features = df_acp.columns
X = df_acp.values
# Centrage et Réduction - Mise à l'échelle
std_scaler = preprocessing.StandardScaler().fit(X)
X_scaled = std_scaler.transform(X)
# Calcul des composantes principales
pca = decomposition.PCA(n_components=n_comp)
pca.fit(X_scaled)
PCA(n_components=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA(n_components=10)
# Valeurs propres ou variances des composantes principales
val_propres = pca.explained_variance_
val_propres
array([3.15227596e+00, 2.05715700e+00, 1.80278871e+00, 1.30347449e+00,
6.94728783e-01, 4.24672919e-01, 2.95163178e-01, 1.98189330e-01,
6.98407047e-02, 1.75054720e-03])
df_acp = pd.DataFrame(pca.components_,
index=['PC'+str(i+1) for i in range(n_comp)],
columns=cols_acp).T
df_acp
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| energy_100g | 0.522321 | -0.002808 | 0.019084 | 0.153808 | 0.044647 | -0.400813 | 0.024903 | -0.088076 | -0.729472 | -0.000104 |
| fat_100g | 0.435801 | 0.140603 | -0.296617 | -0.074467 | 0.313958 | -0.371474 | -0.355453 | -0.235470 | 0.527662 | 0.000104 |
| saturated_fat_100g | 0.423389 | 0.129965 | -0.223669 | -0.227704 | 0.152999 | 0.485580 | 0.629704 | -0.211602 | 0.038400 | -0.000124 |
| carbohydrates_100g | 0.228617 | -0.225910 | 0.521537 | 0.278629 | -0.092584 | -0.334745 | 0.483365 | 0.146043 | 0.414087 | -0.000131 |
| sugars_100g | 0.226603 | -0.220032 | 0.555247 | -0.113334 | -0.144291 | 0.349477 | -0.369457 | -0.547357 | 0.006352 | 0.000238 |
| fiber_100g | 0.110822 | -0.054930 | -0.019368 | 0.750452 | 0.435920 | 0.418133 | -0.173147 | 0.161955 | 0.008757 | -0.000060 |
| proteins_100g | 0.158961 | 0.125069 | -0.354462 | 0.416383 | -0.784358 | 0.072102 | -0.032306 | -0.154232 | 0.121756 | -0.000109 |
| salt_100g | -0.068121 | 0.643262 | 0.261039 | 0.081403 | 0.024959 | -0.020650 | 0.020697 | -0.072937 | -0.004773 | -0.707096 |
| sodium_100g | -0.067901 | 0.643330 | 0.260932 | 0.081531 | 0.024836 | -0.020637 | 0.020978 | -0.072542 | -0.004845 | 0.707117 |
| nutrition_score_fr_100g | 0.455615 | 0.132528 | 0.138134 | -0.280432 | -0.200385 | 0.224708 | -0.272901 | 0.714488 | 0.038892 | -0.000204 |
# Distribution des composantes principales de l'ACP
C = pca.transform(X_scaled)
plt.figure(figsize=(15,4))
plt.boxplot(C)
plt.title('Distribution des composantes principales')
plt.grid(False)
plt.show()
# quel est le pourcentage de variance préservée par chacune de
# nos composantes?
variances = pca.explained_variance_ratio_
variances
array([3.15226284e-01, 2.05714844e-01, 1.80278120e-01, 1.30346906e-01,
6.94725892e-02, 4.24671152e-02, 2.95161950e-02, 1.98188505e-02,
6.98404141e-03, 1.75053991e-04])
# quelle est la somme cumulée des variances?
somme_cumule_var = np.cumsum(variances)
somme_cumule_var
array([0.31522628, 0.52094113, 0.70121925, 0.83156615, 0.90103874,
0.94350586, 0.97302205, 0.9928409 , 0.99982495, 1. ])
# Quel est le nombre minimum de composantes principales pour expliquer 95% de la variance
plt.plot(somme_cumule_var)
# argmax pour > 95 %
top = np.argmax(somme_cumule_var > 0.95)
plt.axhline(y=0.95, color='r')
plt.text(2, 0.96, '>95%', color='r', fontsize=10)
plt.axvline(x=top, color='r')
plt.title('Taux cumulé de variances expliquées pour les composantes')
plt.xlabel('Nombre de composantes')
plt.ylabel('Taux cumulé des variances')
plt.show()
A partir de la 6ème composante on explique 95% de la variance On peut réduire notre jeu de données.
# Eboulis des valeurs propres
tools.display_scree_plot(pca)
n_comp = len(df_acp.columns)
n = df_acp.shape[0]
p = df_acp.shape[1]
features =cols_acp
eigval= (n-1)/n*pca.explained_variance_
eigval
array([3.16486674e+00, 2.08782686e+00, 1.73469894e+00, 1.32790801e+00,
6.75125825e-01, 4.39407508e-01, 3.03337221e-01, 1.89560750e-01,
7.72682043e-02, 7.48314669e-06])
#Racine carrée des valeurs propres
sqrt_eigval = np.sqrt(eigval)
#Corrélation des variables avec les axes
covar = np.zeros((p,p))
for k in range(p):
covar[:,k] = pca.components_[k,:] * sqrt_eigval[k]
mat_cor = pd.DataFrame(np.around(covar, 2),
index=features,
columns=['COR_'+str(i + 1) for i in range(p)])
mat_cor
| COR_1 | COR_2 | COR_3 | COR_4 | COR_5 | COR_6 | COR_7 | COR_8 | COR_9 | COR_10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| energy_100g | 0.92 | -0.06 | 0.00 | 0.21 | 0.00 | -0.25 | -0.00 | 0.05 | -0.21 | -0.0 |
| fat_100g | 0.78 | 0.14 | -0.41 | -0.06 | 0.26 | -0.23 | -0.24 | 0.09 | 0.14 | 0.0 |
| saturated_fat_100g | 0.75 | 0.11 | -0.32 | -0.29 | 0.16 | 0.26 | 0.36 | 0.11 | 0.02 | -0.0 |
| carbohydrates_100g | 0.41 | -0.33 | 0.67 | 0.35 | -0.11 | -0.25 | 0.24 | -0.04 | 0.12 | 0.0 |
| sugars_100g | 0.44 | -0.37 | 0.68 | -0.14 | -0.13 | 0.29 | -0.21 | 0.21 | 0.00 | -0.0 |
| fiber_100g | 0.18 | -0.07 | -0.04 | 0.86 | 0.35 | 0.29 | -0.06 | -0.09 | 0.00 | 0.0 |
| proteins_100g | 0.25 | 0.26 | -0.51 | 0.43 | -0.64 | 0.09 | -0.01 | 0.06 | 0.04 | 0.0 |
| salt_100g | 0.00 | 0.93 | 0.37 | 0.06 | 0.04 | -0.00 | 0.01 | 0.03 | -0.00 | 0.0 |
| sodium_100g | 0.00 | 0.93 | 0.37 | 0.06 | 0.04 | -0.00 | 0.01 | 0.03 | -0.00 | -0.0 |
| nutrition_score_fr_100g | 0.83 | 0.12 | 0.14 | -0.35 | -0.14 | 0.13 | -0.10 | -0.33 | 0.01 | 0.0 |
#Cercle des corrélations
tools.display_circles(covar.T, n_comp, pca,
[(0,1), (2,3), (4,5)],
labels = features
)
plt.show()
# Representation du nuage des individus sur le premier plan factoriel
# echantillon
X_ech_index = df_acp.loc[:, cols_acp].sample(300).index
X_ech = df_acp.loc[X_ech_index, cols_acp].values
X_proj_ech = preprocessing.StandardScaler().fit_transform(X_ech)
tools.display_factorial_planes(X_proj_ech,
n_comp,
pca,
[(0,1), (2,3)],
labels=None,
alpha=1,
illustrative_var=df_nutri.loc[X_ech_index,'nutrition_grade_fr'])
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) Cell In [200], line 13 4 X_ech = df_acp.loc[X_ech_index, cols_acp].values 5 X_proj_ech = preprocessing.StandardScaler().fit_transform(X_ech) 7 tools.display_factorial_planes(X_proj_ech, 8 n_comp, 9 pca, 10 [(0,1), (2,3)], 11 labels=None, 12 alpha=1, ---> 13 illustrative_var=df_nutri.loc[X_ech_index,'nutrition_grade_fr']) File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexing.py:1067, in _LocationIndexer.__getitem__(self, key) 1065 if self._is_scalar_access(key): 1066 return self.obj._get_value(*key, takeable=self._takeable) -> 1067 return self._getitem_tuple(key) 1068 else: 1069 # we by definition only have the 0th axis 1070 axis = self.axis or 0 File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexing.py:1247, in _LocIndexer._getitem_tuple(self, tup) 1245 with suppress(IndexingError): 1246 tup = self._expand_ellipsis(tup) -> 1247 return self._getitem_lowerdim(tup) 1249 # no multi-index, so validate all of the indexers 1250 tup = self._validate_tuple_indexer(tup) File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexing.py:991, in _LocationIndexer._getitem_lowerdim(self, tup) 989 return section 990 # This is an elided recursive call to iloc/loc --> 991 return getattr(section, self.name)[new_key] 993 raise IndexingError("not applicable") File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexing.py:1073, in _LocationIndexer.__getitem__(self, key) 1070 axis = self.axis or 0 1072 maybe_callable = com.apply_if_callable(key, self.obj) -> 1073 return self._getitem_axis(maybe_callable, axis=axis) File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexing.py:1301, in _LocIndexer._getitem_axis(self, key, axis) 1298 if hasattr(key, "ndim") and key.ndim > 1: 1299 raise ValueError("Cannot index with multidimensional key") -> 1301 return self._getitem_iterable(key, axis=axis) 1303 # nested tuple slicing 1304 if is_nested_tuple(key, labels): File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexing.py:1239, in _LocIndexer._getitem_iterable(self, key, axis) 1236 self._validate_key(key, axis) 1238 # A collection of keys -> 1239 keyarr, indexer = self._get_listlike_indexer(key, axis) 1240 return self.obj._reindex_with_indexers( 1241 {axis: [keyarr, indexer]}, copy=True, allow_dups=True 1242 ) File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexing.py:1432, in _LocIndexer._get_listlike_indexer(self, key, axis) 1429 ax = self.obj._get_axis(axis) 1430 axis_name = self.obj._get_axis_name(axis) -> 1432 keyarr, indexer = ax._get_indexer_strict(key, axis_name) 1434 return keyarr, indexer File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexes\base.py:6113, in Index._get_indexer_strict(self, key, axis_name) 6110 else: 6111 keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr) -> 6113 self._raise_if_missing(keyarr, indexer, axis_name) 6115 keyarr = self.take(indexer) 6116 if isinstance(key, Index): 6117 # GH 42790 - Preserve name from an Index File D:\IT\Data Scientist\_env-OC\lib\site-packages\pandas\core\indexes\base.py:6176, in Index._raise_if_missing(self, key, indexer, axis_name) 6173 raise KeyError(f"None of [{key}] are in the [{axis_name}]") 6175 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique()) -> 6176 raise KeyError(f"{not_found} not in index") KeyError: '[15936, 17406, 76571, 130955, 142058, 65038, 108464, 162541, 16428, 159840, 61772, 136786, 93133, 55784, 81637, 123527, 73026, 129663, 72246, 217402, 87590, 28703, 223730, 104519, 159, 444, 8967, 8492, 129649, 25000, 8885, 156262, 59744, 117255, 166765, 129356, 94456, 116618, 23556, 13009, 227047, 109184, 222765, 12899, 87735, 16433, 33731, 89418, 25091, 17448, 114646, 230697, 112465] not in index'
Kaggle
df_acp = df_nutri.copy()
df_acp = df_acp.drop(columns=['additives_n','ingredients_from_palm_oil_n'])
# df_nutri=df_acp.copy()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#Centrage et réduction
X = df_acp.select_dtypes(include=['float64'])
X_scaled = scaler.fit_transform(X)
#Instanciation de l'ACP
pca = PCA(svd_solver='full').fit(X_scaled)
X_projected = pca.transform(X_scaled)
#Variances expliquées en % d'inertie
varexpl = pca.explained_variance_ratio_*100
#Projection de l'éboulis des valeurs propres
plt.figure(figsize=(8,4))
plt.bar(np.arange(len(varexpl))+1, varexpl)
plt.plot(np.arange(len(varexpl))+1, varexpl.cumsum(),c="red",marker='o')
plt.xlabel("rang de l'axe d'inertie")
plt.ylabel("pourcentage d'inertie")
plt.title("Eboulis des valeurs propres", fontdict=font_title)
plt.show(block=False)
print("Le premier plan factoriel couvrira une inertie de {:.2f}% et le second plan : {:.2f}%.".format(varexpl[0:2].sum(),
varexpl[0:4].sum()))
Le premier plan factoriel couvrira une inertie de 52.53% et le second plan : 83.15%.
#Espace des composantes principales
pcs = pca.components_
#Matrice des corrélations variables x facteurs
p = X.shape[1]
sqrt_valprop = np.sqrt(pca.explained_variance_)
corvar = np.zeros((p, p))
for dim in range(p):
corvar[:,dim] = pcs[dim,:] * sqrt_valprop[dim]
#on affiche pour les deux premiers plans factoriels
corr_matrix = pd.DataFrame({'feature':X.columns,'CORR_F1':corvar[:,0],'CORR_F2':corvar[:,1],
'CORR_F3':corvar[:,2], 'CORR_F4':corvar[:,3]})
corr_matrix
| feature | CORR_F1 | CORR_F2 | CORR_F3 | CORR_F4 | |
|---|---|---|---|---|---|
| 0 | energy_100g | 0.919577 | -0.055475 | 0.003847 | 0.212015 |
| 1 | fat_100g | 0.777394 | 0.143944 | -0.411599 | -0.055548 |
| 2 | saturated_fat_100g | 0.752975 | 0.107812 | -0.317509 | -0.285934 |
| 3 | carbohydrates_100g | 0.413601 | -0.334804 | 0.665704 | 0.353578 |
| 4 | sugars_100g | 0.436849 | -0.368174 | 0.683280 | -0.136313 |
| 5 | fiber_100g | 0.182201 | -0.068756 | -0.036149 | 0.861720 |
| 6 | proteins_100g | 0.252664 | 0.264638 | -0.510740 | 0.429292 |
| 7 | salt_100g | 0.001630 | 0.926159 | 0.369268 | 0.057356 |
| 8 | sodium_100g | 0.001721 | 0.926165 | 0.369262 | 0.057372 |
| 9 | nutrition_score_fr_100g | 0.830060 | 0.120236 | 0.139848 | -0.347973 |
#Variable Illustrative
ivNutrigrade = df_nutri['nutrition_grade_fr'].values
#Encodage des nutrition_grades
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
ivNutrigrade = encoder.fit_transform(ivNutrigrade)
ivNutrigrade = ivNutrigrade.reshape((ivNutrigrade.shape[0],1))
#Corrélation de la variable illustrative avec les axes factoriels
corrIv = np.zeros((ivNutrigrade.shape[1],p))
for j in range(p):
for k in range(ivNutrigrade.shape[1]):
corrIv[k,j] = np.corrcoef(ivNutrigrade[:,k],X_projected[:,j])[0,1]
def cerle_corr(pcs, n_comp, pca, axis_ranks,
labels=None, label_rotation=0,
illustrative_var_label=None, illustrative_var_corr=None):
for d1, d2 in axis_ranks:
if d2 < n_comp:
# initialisation de la figure
fig=plt.figure(figsize=(10,10))
fig.subplots_adjust(left=0.1,right=0.9,bottom=0.1,top=0.9)
ax=fig.add_subplot(111)
ax.set_aspect('equal', adjustable='box')
#détermination des limites du graphique
ax.set_xlim(-1,1)
ax.set_ylim(-1,1)
#affichage des flèches
plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
pcs[d1,:],pcs[d2,:],
angles='xy', scale_units='xy', scale=1,
color="grey", alpha=0.5)
# et noms de variables
for i,(x,y) in enumerate(pcs[[d1,d2]].T):
plt.annotate(labels[i],(x,y),
ha='center', va='center',
fontsize='14',color="#17aafa", alpha=0.8)
#variable illustrative
if illustrative_var_label is not None :
plt.annotate(illustrative_var_label,
(illustrative_var_corr[0,d1],illustrative_var_corr[0,d2]),
color='g')
plt.quiver(np.zeros(pcs.shape[1]), np.zeros(pcs.shape[1]),
illustrative_var_corr[0,d1],illustrative_var_corr[0,d2],
angles='xy', scale_units='xy', scale=1, color="g", alpha=0.5)
#ajouter les axes
plt.plot([-1,1],[0,0],linewidth=1, color='grey', ls='--')
plt.plot([0,0],[-1,1],linewidth=1, color='grey', ls='--')
#ajouter un cercle
cercle = plt.Circle((0,0),1,color='#17aafa',fill=False)
ax.add_artist(cercle)
# nom des axes, avec le pourcentage d'inertie expliqué
plt.xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1],1)))
plt.ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2],1)))
plt.title("Cercle des corrélations (F{} et F{})".format(d1+1, d2+1), fontdict=font_title)
plt.show(block=False)
cerle_corr(pcs, 4, pca, [(0,1),(2,3)], labels = np.array(X.columns),
illustrative_var_label="Nutrition_grade_fr", illustrative_var_corr = corrIv)